Load all data

Load Libraries

library(ggsci)
library(tidyverse)
library(dplyr)
library(forcats)
library(reshape2)
library(stringr)
library(tidyr)
library(tibble)
library(sangerseqR)
library(DECIPHER)
library(Biostrings)
library(phangorn)
library(ape)
library(ggplot2)
library(ggtree)
library(patchwork)
library(bioseq)
library(kmer)
library(GUniFrac)
library(seqinr)
library(vegan)
library(corrplot)
library(ggrepel)

library(ggmsa)
library(dendextend)
library(usedist)

Load Symbiont Data

#here is the raw SymPortal Data output that contains data for ITS2 type profiles and DIVs associated with all three coral species
seqs <- read_tsv("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.absolute.abund_and_meta.txt") %>%
  mutate(plate = case_when(str_detect(sample_name, "Plate1") ~ "Plate1",
                           str_detect(sample_name, "Plate2") ~ "Plate2",
                           str_detect(sample_name, "Plate3") ~ "Plate3",
                           str_detect(sample_name, "Plate4") ~ "Plate4",
                           str_detect(sample_name, "Plate5") ~ "Plate5", 
                           str_detect(sample_name, "Plate6") ~ "Plate6", 
                           str_detect(sample_name, "Plate7") ~ "Plate7", 
                           TRUE ~ "Other"),
         position = str_sub(sample_name, start = 8, end = 11),
         plate_position = paste0(plate, "_", position)) %>%
  filter(!(is.na(sample_name))) %>%
  select(sample_name = plate_position, `A1`:`1275234_G`) %>%
  mutate(sample_name = as.factor(sample_name))

Load Metadata

meta = read.csv("Metadata.csv") %>% 
  mutate(sample_name = as.factor(sample_name))

Load Custom Functions

read_fasta_df <- function (file = "") {
  fasta <- readLines(file)
  ind <- grep(">", fasta)
  s <- data.frame(ind = ind, from = ind + 1, to = c((ind - 
    1)[-1], length(fasta)))
  seqs <- rep(NA, length(ind))
  for (i in 1:length(ind)) {
    seqs[i] <- paste(fasta[s$from[i]:s$to[i]], collapse = "")
  }
  tib <- tibble(label = gsub(">", "", fasta[ind]), sequence = seqs)
  return(tib)
}

write_fasta_df <- function (data, filename) 
{
    fastaLines = c()
    for (rowNum in 1:nrow(data)) {
        fastaLines = c(fastaLines, as.character(paste(">", 
            data[rowNum, "label"], sep = "")))
        fastaLines = c(fastaLines, as.character(data[rowNum, 
            "sequence"]))
    }
    fileConn <- file(filename)
    writeLines(fastaLines, fileConn)
    close(fileConn)
}

dna_to_DNAbin <- function (dna){
  DNAbin <- as_DNAbin(dna)
  names(DNAbin) <- names(dna)
  return(DNAbin)
}
dna_to_DNAStringset <- function(x) 
{
    bioseq:::check_dna(x)
    DNAstr <- DNAStringSet(paste(x))
    names(DNAstr) <- names(x)
    return(DNAstr)
}

DNAStringSet_to_dna <- function(x){
    x_dna <- as_dna(paste(x))
    names(x_dna) <- names(x)
    res <- tibble(label = names(x), sequence = x_dna)
    return(res)
}

# Convert DNAstringset to DNAbin
DNAStringSet_to_DNAbin <- function(DNAStringSet){
  DNAbin <- as.DNAbin(DNAStringSet)
  return(DNAbin)
}

# https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2315-y
palette <- c("A" = "#46ff2d", 
             "G" = "#ffae01", 
             "C" = "#f24641", 
             "T" = "#4294fa", 
             "K" = "#8b4816",
             "M" = "#83831f",
             "R" = "#ffff81",
             "S" = "#ff9d80",
             "Y" = "#e381f2",
             "W" = "#80fff2",
             "V" = "#fde4b8",
             "B" = "#f9c1bf",
             "H" = "#c0d9f9",
             "D" = "#c7ffba",
             "U" = "#8989fb",
             "N" = "black", 
             "-" = "white",
             "+" = "White")


pal_df <- data.frame(names = names(palette), col = palette)

Combining sequence/metadata

# Convert to long format
seqs_long <- seqs %>%
  filter(!is.na(sample_name)) %>%
  select(sample_name, `A1`:`1275234_G`) %>%
  pivot_longer(`A1`:`1275234_G`) %>%
  filter(value > 0) %>% # Remove zero values
  left_join(., meta)

# Q. Are we working with the post-med seqs according to the metadata in seqs?
san_check <- seqs_long %>%
  group_by(sample_name) %>%
  summarise(total = sum(value)) #A. yes

# Create a list of samples to keep that didnt fail to sequence
keepers_ss <- san_check %>%
  filter(total > 1500)

#we filter out 3 samples with less than 1500 reads: Plate4_B007, Plate5_G011, Plate7_D009

# Filter out the failed samples
seqs_long <- seqs_long %>%
  filter(sample_name %in% keepers_ss$sample_name) %>%
  group_by(sample_name) %>%
  mutate(value_rel = value/sum(value)) %>% # Convert to relative abundance
  ungroup() %>%
  mutate(name = as.factor(name)) # Make sample names a factor

# Create a random palette for each sequence
n <- length(levels(seqs_long$name))
seqs_pal = rainbow(n, s=.6, v=.9)[sample(1:n,n, replace = FALSE)]
names(seqs_pal) <- levels(seqs_long$name)

# Read in the profile data
profiles_raw <- read_tsv("20210612_marzonie/186_20211115_03_DBV_20211116T024440.profiles.absolute.abund_and_meta.txt", skip = 6) %>%
    select(sample_name = `...2`, `A1/A1h`:`C42a/C1-C42.2`) %>%
   mutate(plate = case_when(str_detect(sample_name, "Plate1") ~ "Plate1",
                           str_detect(sample_name, "Plate2") ~ "Plate2",
                           str_detect(sample_name, "Plate3") ~ "Plate3",
                           str_detect(sample_name, "Plate4") ~ "Plate4",
                           str_detect(sample_name, "Plate5") ~ "Plate5", 
                           str_detect(sample_name, "Plate6") ~ "Plate6", 
                           str_detect(sample_name, "Plate7") ~ "Plate7", 
                           TRUE ~ "Other"),
         position = str_sub(sample_name, start = 8, end = 11),
         plate_position = paste0(plate, "_", position)) %>%
    filter(!is.na(sample_name)) %>%
  select(sample_name = plate_position, -plate, -position, `A1/A1h`:`C42a/C1-C42.2`)

#Convert to long format 
profiles_long <- profiles_raw %>%
  pivot_longer(`A1/A1h`:`C42a/C1-C42.2`) %>% # Convert it to long format
  mutate(name = paste0("p_", name)) %>% # Add a p_ to the beginning of each profile (Some profiles are single sequence profiles and clash with the Sequence names)
  filter(sample_name %in% seqs_long$sample_name) %>% # Remove samples that dont appear in the Sequence dataframe
  group_by(sample_name) %>%
  mutate(value = as.numeric(value)) %>%
  filter(value > 0) %>% # Remove 0 abundance profiles
  mutate(sample_name = as.factor(sample_name),
         name = as.factor(name)) %>% 
  ungroup() %>%
  left_join(., meta) # Add in metadata

# What is the total number of profile-related sequences in each sample?
profiles_sum <- profiles_long %>%
  group_by(sample_name) %>%
  summarise(total = sum(value))

# How many sequences in each sample are not part of a profile?
residual <- left_join(profiles_sum, san_check, by = "sample_name") %>%
  mutate(residual = total.y - total.x) %>%
  select(sample_name, value = residual) %>%
  mutate(name = "non-profile sequences") %>%
  left_join(., meta)

# Combine the profiles and non-profile sequences
profile_data <- rbind(profiles_long, residual) %>%
  group_by(sample_name) %>%
  mutate(value_rel = value/sum(value)) # convert to relative abundance - in that sample 

# Create palette for profiles (this is a darker palette)
n <- length(levels(profile_data$name))
profile_pal = rainbow(n, s=.6, v=.6)[sample(1:n,n, replace = FALSE)]
names(profile_pal) <- levels(profile_data$name)

# Merge the palettes and replace the non-profile sequences with grey
all_pal <- c(seqs_pal, profile_pal)
all_pal['non-profile sequences'] <- "#808080" 

# Join profiles and sequence data together into single dataframe and add more metadata
all_data <- rbind(seqs_long, profile_data) %>%
  mutate(coral_genus = case_when(str_detect(mtORF, "mean|verru|aplo|nkno") ~ "Pocillopora",
                                 str_detect(mtORF, "humil") ~ "Acropora",
                                 TRUE ~ "check"))

all_data %>% filter(coral_genus != "Acropora") %>% distinct(sample_name)
## # A tibble: 346 × 1
##    sample_name
##    <fct>      
##  1 Plate2_G008
##  2 Plate3_D008
##  3 Plate3_G007
##  4 Plate2_F001
##  5 Plate3_H008
##  6 Plate4_C005
##  7 Plate2_B004
##  8 Plate3_E009
##  9 Plate2_G006
## 10 Plate4_E004
## # … with 336 more rows

3.1 Library Stats

##Basic Stats

# How many samples per species?
all_data %>%
  distinct(sample_name, mtORF) %>%
  group_by(mtORF) %>% 
  summarise(total_samples = n())
## # A tibble: 5 × 2
##   mtORF       total_samples
##   <chr>               <int>
## 1 Ahumilis              260
## 2 Haplotype8a            39
## 3 Pmeandrina            133
## 4 Pverrucosa            152
## 5 Unknown                22
# Ahumilis  260         
# Haplotype8a   39          
# Pmeandrina    133         
# Pverrucosa    152         
# Unknown   22

humil_total <- 260
hap8a_total <- 39
veru_total <- 152
mean_total <- 133

#remove unknowns from downstream analysis
clean_data <- all_data %>%
  filter(mtORF != "Unknown")

# Total number of sequences (for whole library)? 
study_total <- clean_data %>% 
   filter(!(str_detect(name, "p_")),
         name != "non-profile sequences") %>% 
  summarise(total_seqs = sum(value)) %>%
  pull(total_seqs)
#15,336,586 total   

#Total number of sequences (per species)? 
clean_data %>%
    filter(!(str_detect(name, "p_")),
         name != "non-profile sequences") %>% 
  group_by(mtORF) %>%
  summarise(total_seqs = sum(value))
## # A tibble: 4 × 2
##   mtORF       total_seqs
##   <chr>            <dbl>
## 1 Ahumilis       6627245
## 2 Haplotype8a    1025443
## 3 Pmeandrina     3512508
## 4 Pverrucosa     4171390
# Ahumilis  6627245         
# Haplotype8a   1025443         
# Pmeandrina    3512508         
# Pverrucosa    4171390 

Sequencing Depth

# Average per sample sequencing depth? (per coral host genus, and per coral species)

# Average per sample across all spp
clean_data %>%
  group_by(sample_name) %>% 
  dplyr::summarise(per_sample = sum(value)) %>%
  dplyr::summarise(mean_all = mean(per_sample))
## # A tibble: 1 × 1
##   mean_all
##      <dbl>
## 1   52523.
# Across all:  52522.55 

#average per sample per coral host genus 
clean_data %>%
  group_by(coral_genus, sample_name) %>% 
  summarise(per_sample = sum(value)) %>%
  summarise(mean_all = mean(per_sample), n = n())
## # A tibble: 2 × 3
##   coral_genus mean_all     n
##   <chr>          <dbl> <int>
## 1 Acropora      50979.   260
## 2 Pocillopora   53761.   324
# Acropora  50978.81    260     
# Pocillopora   53761.36    324 


#average per sample per coral host species 
clean_data %>% 
  group_by(mtORF, sample_name) %>% 
  summarise(per_sample = sum(value)) %>%
  summarise(mean_all = mean(per_sample), n = n())
## # A tibble: 4 × 3
##   mtORF       mean_all     n
##   <chr>          <dbl> <int>
## 1 Ahumilis      50979.   260
## 2 Haplotype8a   52587.    39
## 3 Pmeandrina    52820.   133
## 4 Pverrucosa    54887.   152
# Species
# Ahumilis  50978.81    260     
# Haplotype8a   52586.82    39      
# Pmeandrina    52819.67    133     
# Pverrucosa    54886.71    152     

##Number of sequences per symbiont genus

# Total number of Cladocopium 
clean_data %>%
  filter(!(str_detect(name, "p_")), name != "non-profile sequences") %>%
  filter(str_sub(name, 1, 1) == "C" | str_detect(name, "_C")) %>% 
  summarise(sum = sum(value))
## # A tibble: 1 × 1
##        sum
##      <dbl>
## 1 15335016
# 15335016  

(15335016 / study_total) * 100 # 99.98976 Clado
## [1] 99.98976
#Total number of Symbiodinium
clean_data %>%
  filter(!(str_detect(name, "p_")), name != "non-profile sequences") %>%
  filter(str_sub(name, 1, 1) == "A" | str_detect(name, "_A")) %>% 
  summarise(sum = sum(value)) 
## # A tibble: 1 × 1
##     sum
##   <dbl>
## 1  1552
# 1552  total seqs
(1552 / study_total) * 100 # 0.01011959 Symbio
## [1] 0.01011959
# Total number of Durusdinium sequences
clean_data %>%
  filter(!(str_detect(name, "p_")), name != "non-profile sequences") %>%
  filter(str_sub(name, 1, 1) == "D" | str_detect(name, "_D")) %>% 
  summarise(sum = sum(value)) 
## # A tibble: 1 × 1
##     sum
##   <dbl>
## 1    13
# 13 total reads

(13 / study_total) * 100 # 8.476463e-05 Durusdinium
## [1] 8.476463e-05

#3.2 Characteristics of symbiont DIVs and Type Profiles

# general statements that apply to both species 

# what proportion of each sample are composed of non-profile sequences (grouped by coral host species)
clean_data %>%
  filter(str_detect(name, "non")) %>%
  group_by(mtORF) %>%
  summarise(mean = mean(value_rel))
## # A tibble: 4 × 2
##   mtORF        mean
##   <chr>       <dbl>
## 1 Ahumilis    0.245
## 2 Haplotype8a 0.176
## 3 Pmeandrina  0.179
## 4 Pverrucosa  0.139
# Ahumilis  0.2454587           
# Haplotype8a   0.1763951           
# Pmeandrina    0.1792804           
# Pverrucosa    0.1393484   

# TO DO FURTHER DOWN - humilis split, one side of the unifrac tree has a greater prop of non-profile seqs than the other

clean_data %>%
  filter(str_detect(name, "non")) %>%
  group_by(coral_genus) %>%
  summarise(mean = mean(value_rel))
## # A tibble: 2 × 2
##   coral_genus  mean
##   <chr>       <dbl>
## 1 Acropora    0.245
## 2 Pocillopora 0.160
# Acropora  0.2454587           
# Pocillopora   0.1601996   

# Acropora has a lot more non-profile sequences than any Pocillopora. 

Total number of Type Profiles

library(tidyverse)

#total type profiles across Pocilloporidae 
clean_data %>%
  filter(coral_genus == "Pocillopora") %>% 
  filter(str_detect(name, "p_")) %>%    #profiles start with p_
  group_by(name) %>%
  dplyr:: count() %>%
  dplyr:: arrange(desc(n)) %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  mutate(cumulative_sum = cumsum(prop))
## # A tibble: 26 × 4
##    name                                                n   prop cumulative_sum
##    <fct>                                           <int>  <dbl>          <dbl>
##  1 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3         97 0.296           0.296
##  2 p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p    71 0.216           0.512
##  3 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d     43 0.131           0.643
##  4 p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au            36 0.110           0.753
##  5 p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k                 13 0.0396          0.793
##  6 p_C42u/C42a/C1/C42.2/C42b-C1b-C1au                 11 0.0335          0.826
##  7 p_C42.2/C1dh/C1/C1d-C1b-C3cg                        9 0.0274          0.854
##  8 p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d              6 0.0183          0.872
##  9 p_C1ag/C1/C1ah-C42.2-C3cg-C1b                       6 0.0183          0.890
## 10 p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au             5 0.0152          0.905
## # … with 16 more rows
# profile n_samps prop_samps cumulative_sum
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3    97  0.295731707 0.2957317   
# p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p   71  0.216463415 0.5121951   
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d    43  0.131097561 0.6432927   
# p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au   36  0.109756098 0.7530488   
# p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k    13  0.039634146 0.7926829   
# p_C42u/C42a/C1/C42.2/C42b-C1b-C1au    11  0.033536585 0.8262195   
# p_C42.2/C1dh/C1/C1d-C1b-C3cg  9   0.027439024 0.8536585   
# p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d    6   0.018292683 0.8719512   
# p_C1ag/C1/C1ah-C42.2-C3cg-C1b 6   0.018292683 0.8902439   
# p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au   5   0.015243902 0.9054878   
# p_C1d-C42.2-C1-C1k-C1b-C3cg   5   0.015243902 0.9207317   
# p_C42.2/C1/C42a-C1b-C42g-C42b-C1au-C1az   5   0.015243902 0.9359756   
# p_C1ag/C1/C42.2-C3cg-C1b-C1bi 4   0.012195122 0.9481707   
# p_C1ag-C1-C42.2-C1bi-C3cg-C1b-C1bk    2   0.006097561 0.9542683   
# p_C1d/C15h-C1-C42.2   2   0.006097561 0.9603659   
# p_C42u-C1-C42.2-C42a-C1b-C1au-C115k-C115d 2   0.006097561 0.9664634   
# p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c   2   0.006097561 0.9725610   
# p_C3-C3k  1   0.003048780 0.9756098   
# p_C42g-C42.2-C1-C1b-C42h-C42a-C42ba-C42b  1   0.003048780 0.9786585   
# p_C42a/C1/C42.2/C1b/C1j-C1au-C3-C115l 1   0.003048780 0.9817073   
# p_A1/A1h  1   0.003048780 0.9847561   
# p_C15h    1   0.003048780 0.9878049   
# p_C1ag/C1-C42.2-C3cg-C1b  1   0.003048780 0.9908537   
# p_C1d 1   0.003048780 0.9939024   
# p_C1ag/C1-C1m-C42.2-C1lr-C1bi-C3cg    1   0.003048780 0.9969512   
# p_C1/C42.2/C42g/C42a-C1b-C1az-C3-C1au 1   0.003048780 1.0000000   

#total number of profiles, in order of highest abundance across P. verrucosa samples 
clean_data %>%
  filter(mtORF == "Pverrucosa") %>% 
  filter(str_detect(name, "p_")) %>%    #profiles start with p_
  group_by(name) %>%
  dplyr:: count() %>%
  dplyr:: arrange(desc(n)) %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  mutate(cumulative_sum = cumsum(prop))
## # A tibble: 16 × 4
##    name                                                n    prop cumulative_sum
##    <fct>                                           <int>   <dbl>          <dbl>
##  1 p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p    71 0.461            0.461
##  2 p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au            36 0.234            0.695
##  3 p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k                 13 0.0844           0.779
##  4 p_C42.2/C1dh/C1/C1d-C1b-C3cg                        8 0.0519           0.831
##  5 p_C1ag/C1/C1ah-C42.2-C3cg-C1b                       6 0.0390           0.870
##  6 p_C1d-C42.2-C1-C1k-C1b-C3cg                         5 0.0325           0.903
##  7 p_C1ag/C1/C42.2-C3cg-C1b-C1bi                       4 0.0260           0.929
##  8 p_C1ag-C1-C42.2-C1bi-C3cg-C1b-C1bk                  2 0.0130           0.942
##  9 p_C1d/C15h-C1-C42.2                                 2 0.0130           0.955
## 10 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3          1 0.00649          0.961
## 11 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d      1 0.00649          0.968
## 12 p_C3-C3k                                            1 0.00649          0.974
## 13 p_C1ag/C1-C42.2-C3cg-C1b                            1 0.00649          0.981
## 14 p_C1d                                               1 0.00649          0.987
## 15 p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c             1 0.00649          0.994
## 16 p_C1ag/C1-C1m-C42.2-C1lr-C1bi-C3cg                  1 0.00649          1
verru_pros <- 22

# p_C1d/C1/C42.2/C3-C1b-C3cg-C115k-C45c-C1au-C41p   71  0.461038961 0.4610390   
# p_C1d/C1/C42.2-C3cg-C1b-C45c-C115k-C1au   36  0.233766234 0.6948052   
# p_C1d/C1/C1ba-C42.2-C3cg-C1b-C115k    13  0.084415584 0.7792208   
# p_C42.2/C1dh/C1/C1d-C1b-C3cg  8   0.051948052 0.8311688   
# p_C1ag/C1/C1ah-C42.2-C3cg-C1b 6   0.038961039 0.8701299   
# p_C1d-C42.2-C1-C1k-C1b-C3cg   5   0.032467532 0.9025974   
# p_C1ag/C1/C42.2-C3cg-C1b-C1bi 4   0.025974026 0.9285714   
# p_C1ag-C1-C42.2-C1bi-C3cg-C1b-C1bk    2   0.012987013 0.9415584   
# p_C1d/C15h-C1-C42.2   2   0.012987013 0.9545455   
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3    1   0.006493506 0.9610390   
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d    1   0.006493506 0.9675325   
# p_C3-C3k  1   0.006493506 0.9740260   
# p_C1ag/C1-C42.2-C3cg-C1b  1   0.006493506 0.9805195   
# p_C1d 1   0.006493506 0.9870130   
# p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c   1   0.006493506 0.9935065   
# p_C1ag/C1-C1m-C42.2-C1lr-C1bi-C3cg    1   0.006493506 1.0000000   

clean_data %>%
  filter(mtORF == "Pmeandrina") %>% 
  filter(str_detect(name, "p_")) %>%
  group_by(name) %>%
  dplyr:: count() %>%
  dplyr:: arrange(desc(n)) %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  mutate(cumulative_sum = cumsum(prop))
## # A tibble: 12 × 4
##    name                                               n    prop cumulative_sum
##    <fct>                                          <int>   <dbl>          <dbl>
##  1 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3        74 0.548            0.548
##  2 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d    33 0.244            0.793
##  3 p_C42u/C42a/C1/C42.2/C42b-C1b-C1au                 9 0.0667           0.859
##  4 p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d             5 0.0370           0.896
##  5 p_C42.2/C1/C42a-C1b-C42g-C42b-C1au-C1az            5 0.0370           0.933
##  6 p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au            3 0.0222           0.956
##  7 p_C42g-C42.2-C1-C1b-C42h-C42a-C42ba-C42b           1 0.00741          0.963
##  8 p_C42a/C1/C42.2/C1b/C1j-C1au-C3-C115l              1 0.00741          0.970
##  9 p_A1/A1h                                           1 0.00741          0.978
## 10 p_C15h                                             1 0.00741          0.985
## 11 p_C42.2/C1dh/C1/C1d-C1b-C3cg                       1 0.00741          0.993
## 12 p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c            1 0.00741          1
mean_pros <- 12

# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3    74  0.548148148 0.5481481   
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d    33  0.244444444 0.7925926   
# p_C42u/C42a/C1/C42.2/C42b-C1b-C1au    9   0.066666667 0.8592593   
# p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d    5   0.037037037 0.8962963   
# p_C42.2/C1/C42a-C1b-C42g-C42b-C1au-C1az   5   0.037037037 0.9333333   
# p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au   3   0.022222222 0.9555556   
# p_C42g-C42.2-C1-C1b-C42h-C42a-C42ba-C42b  1   0.007407407 0.9629630   
# p_C42a/C1/C42.2/C1b/C1j-C1au-C3-C115l 1   0.007407407 0.9703704   
# p_A1/A1h  1   0.007407407 0.9777778   
# p_C15h    1   0.007407407 0.9851852   
# p_C42.2/C1dh/C1/C1d-C1b-C3cg  1   0.007407407 0.9925926   
# p_C1ag/C1/C42.2/C1bi-C3cg-C1b-C3cw-C45c   1   0.007407407 1.0000000   

clean_data %>%
  filter(mtORF == "Haplotype8a") %>% 
  filter(str_detect(name, "p_")) %>%    #profiles start with p_
  group_by(name) %>%
  dplyr:: count() %>%
  dplyr:: arrange(desc(n)) %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  mutate(cumulative_sum = cumsum(prop))
## # A tibble: 7 × 4
##   name                                               n   prop cumulative_sum
##   <fct>                                          <int>  <dbl>          <dbl>
## 1 p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3        22 0.564           0.564
## 2 p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d     9 0.231           0.795
## 3 p_C42u/C42a/C1/C42.2/C42b-C1b-C1au                 2 0.0513          0.846
## 4 p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au            2 0.0513          0.897
## 5 p_C42u-C1-C42.2-C42a-C1b-C1au-C115k-C115d          2 0.0513          0.949
## 6 p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d             1 0.0256          0.974
## 7 p_C1/C42.2/C42g/C42a-C1b-C1az-C3-C1au              1 0.0256          1
hap8a_pros <- 7

# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3    22  0.56410256  0.5641026   
# p_C1/C42.2/C42u-C1b-C42a-C1au-C115l-C1az-C115d    9   0.23076923  0.7948718   
# p_C42u/C42a/C1/C42.2/C42b-C1b-C1au    2   0.05128205  0.8461538   
# p_C42g/C42a/C42.2/C1-C42h-C42b-C1b-C1au   2   0.05128205  0.8974359   
# p_C42u-C1-C42.2-C42a-C1b-C1au-C115k-C115d 2   0.05128205  0.9487179   
# p_C42.2/C1/C42a/C1b-C1au-C1az-C3-C115d    1   0.02564103  0.9743590   
# p_C1/C42.2/C42g/C42a-C1b-C1az-C3-C1au 1   0.02564103  1.0000000   

clean_data %>%
  filter(mtORF == "Ahumilis") %>% 
  filter(str_detect(name, "p_")) %>%    #profiles start with p_
  group_by(name) %>%
  dplyr:: count() %>%
  dplyr:: arrange(desc(n)) %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  mutate(cumulative_sum = cumsum(prop))
## # A tibble: 23 × 4
##    name                                     n    prop cumulative_sum
##    <fct>                                <int>   <dbl>          <dbl>
##  1 p_C3k/C3-C50a-C29-C21ab-C3b            160 0.593            0.593
##  2 p_C3k/C3-C50a-C21ab-C50f-C3ba-C3dq      52 0.193            0.785
##  3 p_C3k/C3-C50a-C3ba-C50f-C3dq-C21-C3a    14 0.0519           0.837
##  4 p_C3k/C3-C50a-C3jv-C3vx-C3vy            12 0.0444           0.881
##  5 p_C3bo/C3k-C3-C3bp-C50a-C29              6 0.0222           0.904
##  6 p_C3k/C3/C1-C50a-C21ab                   4 0.0148           0.919
##  7 p_C3k-C3-C50a-C21ab-C3b                  3 0.0111           0.930
##  8 p_C3k-C3-C50a-C21ab-C50f-C3ba            2 0.00741          0.937
##  9 p_C1/C42.2                               2 0.00741          0.944
## 10 p_C1/C1c                                 2 0.00741          0.952
## # … with 13 more rows
humil_pros <- 23

# p_C3k/C3-C50a-C29-C21ab-C3b   160 0.592592593 0.5925926   
# p_C3k/C3-C50a-C21ab-C50f-C3ba-C3dq    52  0.192592593 0.7851852   
# p_C3k/C3-C50a-C3ba-C50f-C3dq-C21-C3a  14  0.051851852 0.8370370   
# p_C3k/C3-C50a-C3jv-C3vx-C3vy  12  0.044444444 0.8814815   
# p_C3bo/C3k-C3-C3bp-C50a-C29   6   0.022222222 0.9037037   
# p_C3k/C3/C1-C50a-C21ab    4   0.014814815 0.9185185   
# p_C3k-C3-C50a-C21ab-C3b   3   0.011111111 0.9296296   
# p_C3k-C3-C50a-C21ab-C50f-C3ba 2   0.007407407 0.9370370   
# p_C1/C42.2    2   0.007407407 0.9444444   
# p_C1/C1c  2   0.007407407 0.9518519   
# p_C1/C42.2/C42g/C42a-C1b-C1au-C1az-C42h-C3    1   0.003703704 0.9555556   
# p_A1  1   0.003703704 0.9592593   
# p_C3/C3k-C29-C21ab-C3b-C3gj-C21.12    1   0.003703704 0.9629630   
# p_C3-C21-C3k-C3at-C3b-C3av-C3dp   1   0.003703704 0.9666667   
# p_C40/C1-C3-C115  1   0.003703704 0.9703704   
# p_C42.2/C42a/C1-C1b   1   0.003703704 0.9740741   
# p_C1d/C1  1   0.003703704 0.9777778   
# p_C1/C3k-C1b-C3-C42.2-C1bh-C1br   1   0.003703704 0.9814815   
# p_C3/C21/C3av-C3at-C3b-C3dp   1   0.003703704 0.9851852   
# p_C3k-C50a-C3cz   1   0.003703704 0.9888889   
# p_C42a/C1-C42.2   1   0.003703704 0.9925926   
# p_C1/C3-C1c-C1b-C1w   1   0.003703704 0.9962963   
# p_C3k/C50a    1   0.003703704 1.0000000       
#What is the proportion of samples that had 1 and 2 type profiles?

clean_data %>% 
 filter(coral_genus == "Pocillopora") %>% 
 filter(str_detect(name, "p_")) %>%
 group_by(sample_name) %>% 
 summarise(n = n()) %>% 
 filter(n == 1)     #320  samples have 1 type profile
## # A tibble: 320 × 2
##    sample_name     n
##    <fct>       <int>
##  1 Plate1_A001     1
##  2 Plate1_A002     1
##  3 Plate1_A003     1
##  4 Plate1_A004     1
##  5 Plate1_A005     1
##  6 Plate1_A006     1
##  7 Plate1_A008     1
##  8 Plate1_A009     1
##  9 Plate1_A010     1
## 10 Plate1_A011     1
## # … with 310 more rows
 #filter(n == 2)     #4 samples have 2 type profiles

clean_data %>% 
  filter(coral_genus == "Acropora") %>% 
  filter(str_detect(name, "p_")) %>%
  group_by(sample_name) %>% 
  summarise(n = n()) %>% 
 #filter(n == 1)     #250 samples have 1 type profile
 filter(n == 2)     #10 samples have 2 type profiles 
## # A tibble: 10 × 2
##    sample_name     n
##    <fct>       <int>
##  1 Plate6_A001     2
##  2 Plate6_D009     2
##  3 Plate6_F009     2
##  4 Plate6_F011     2
##  5 Plate7_A006     2
##  6 Plate7_A010     2
##  7 Plate7_B010     2
##  8 Plate7_D006     2
##  9 Plate7_E006     2
## 10 Plate7_H009     2

Majority Sequences

#pverrucosa 
clean_data %>% 
  filter(mtORF == "Pverrucosa") %>% 
  filter(str_detect(name, c("p_"))) %>% 
  filter(str_detect(name, c("p_C1d")))
## # A tibble: 128 × 47
##    sample_name name    value Refer…¹ Vial  Species mtORF psba_…² psba_…³ psba_…⁴
##    <fct>       <fct>   <dbl>   <int> <chr> <chr>   <chr> <chr>   <chr>   <chr>  
##  1 Plate4_E003 p_C1d/… 27948     426 P ve… Pverru… Pver… ""      ""      ""     
##  2 Plate1_E006 p_C1d/… 18997     459 P ve… Pverru… Pver… ""      ""      ""     
##  3 Plate4_D005 p_C1d/… 18172     448 P ve… Pverru… Pver… "PverC… "Bad"   "Bad"  
##  4 Plate2_C011 p_C1d/… 27531     519 P ve… Pverru… Pver… ""      ""      ""     
##  5 Plate4_A003 p_C1d/… 26506     420 P ve… Pverru… Pver… ""      ""      ""     
##  6 Plate1_F011 p_C1d/… 25657     541 P ve… Pverru… Pver… ""      ""      ""     
##  7 Plate1_G008 p_C1d/… 32542     480 P ve… Pverru… Pver… ""      ""      ""     
##  8 Plate2_H010 p_C1d/… 21499     515 P ve… Pverru… Pver… ""      ""      ""     
##  9 Plate3_C004 p_C1d/… 19773     425 P ve… Pverru… Pver… ""      ""      ""     
## 10 Plate1_C011 p_C1d/… 29084     538 P ve… Pverru… Pver… ""      ""      ""     
## # … with 118 more rows, 37 more variables: psba_ID <chr>, mt_forwardQC <chr>,
## #   mt_reverseQC <chr>, mtorf_ID <chr>, Reef <chr>, Site <chr>,
## #   DateCollect <chr>, catBleaching <int>, Depth <dbl>, DHW <chr>, Lat <dbl>,
## #   Long <dbl>, Exposure <chr>, Aspect <chr>, Sector <chr>, maxDHW <dbl>,
## #   meanDHW <dbl>, recent.maxDHW <dbl>, recent.meanDHW <dbl>, DHW2 <int>,
## #   DHW3 <int>, DHW4 <int>, DHW6 <int>, DHW8 <int>, DHW9 <int>,
## #   returnDHW3 <dbl>, returnDHW4 <dbl>, returnDHW6 <dbl>, meanSST <dbl>, …
128 / veru_total 
## [1] 0.8421053
#found as majority sequence in 84.2% of P. verrucosa samples 


#p.mea/haplotype 8a 
clean_data %>% 
 filter(mtORF == "Pmeandrina" | mtORF == "Haplotype8a") %>% 
 filter(str_detect(name, c("p_"))) %>% 
 filter(str_detect(name, c("p_C1|p_C42.2"))) 
## # A tibble: 153 × 47
##    sample_name name    value Refer…¹ Vial  Species mtORF psba_…² psba_…³ psba_…⁴
##    <fct>       <fct>   <dbl>   <int> <chr> <chr>   <chr> <chr>   <chr>   <chr>  
##  1 Plate2_G008 p_C1/C… 26362     365 P ve… Pmeand… Pmea… ""      ""      ""     
##  2 Plate3_D008 p_C1/C… 20767     591 P ve… Unknown Hapl… "PunkC… "Bad"   "Bad"  
##  3 Plate3_G007 p_C1/C… 25118     360 P ve… Pmeand… Pmea… ""      ""      ""     
##  4 Plate2_F001 p_C1/C… 23410     277 P ve… Pmeand… Pmea… ""      ""      ""     
##  5 Plate3_H008 p_C1/C… 29095     595 P ve… Unknown Hapl… ""      ""      ""     
##  6 Plate4_C005 p_C1/C… 19389     329 P ve… Pmeand… Pmea… ""      ""      ""     
##  7 Plate2_B004 p_C1/C… 26328     315 P ve… Pmeand… Pmea… ""      ""      ""     
##  8 Plate3_E009 p_C1/C… 29186     597 P ve… Unknown Hapl… ""      ""      ""     
##  9 Plate2_G006 p_C1/C… 22104     585 P ve… Unknown Hapl… "PunkC… "Good"  "Bad"  
## 10 Plate4_E004 p_C1/C… 24129     327 P ve… Pmeand… Pmea… ""      ""      ""     
## # … with 143 more rows, 37 more variables: psba_ID <chr>, mt_forwardQC <chr>,
## #   mt_reverseQC <chr>, mtorf_ID <chr>, Reef <chr>, Site <chr>,
## #   DateCollect <chr>, catBleaching <int>, Depth <dbl>, DHW <chr>, Lat <dbl>,
## #   Long <dbl>, Exposure <chr>, Aspect <chr>, Sector <chr>, maxDHW <dbl>,
## #   meanDHW <dbl>, recent.maxDHW <dbl>, recent.meanDHW <dbl>, DHW2 <int>,
## #   DHW3 <int>, DHW4 <int>, DHW6 <int>, DHW8 <int>, DHW9 <int>,
## #   returnDHW3 <dbl>, returnDHW4 <dbl>, returnDHW6 <dbl>, meanSST <dbl>, …
153 /(mean_total + hap8a_total) # 0.8895349
## [1] 0.8895349
#acropora 
clean_data %>% 
  filter(mtORF == "Ahumilis") %>% 
    filter(str_detect(name, c("p_"))) %>% 
    filter(str_detect(name, c("p_C3k|p_C3"))) 
## # A tibble: 258 × 47
##    sample_name name    value Refer…¹ Vial  Species mtORF psba_…² psba_…³ psba_…⁴
##    <fct>       <fct>   <dbl>   <int> <chr> <chr>   <chr> <chr>   <chr>   <chr>  
##  1 Plate5_C011 p_C3k/… 20465     208 A te… Ahumil… Ahum… ""      ""      ""     
##  2 Plate5_F007 p_C3k/… 27117     129 A te… Ahumil… Ahum… ""      ""      ""     
##  3 Plate5_D012 p_C3k/… 13819     244 A te… Ahumil… Ahum… ""      ""      ""     
##  4 Plate7_B007 p_C3k/… 22306     205 A te… Ahumil… Ahum… "AcroC… "Good"  "Good" 
##  5 Plate7_F002 p_C3k/… 23441      88 A te… Ahumil… Ahum… ""      ""      ""     
##  6 Plate6_A010 p_C3k/… 24848     204 A te… Ahumil… Ahum… ""      ""      ""     
##  7 Plate7_G004 p_C3k/… 17492     151 A te… Ahumil… Ahum… ""      ""      ""     
##  8 Plate7_A004 p_C3k/… 24349     136 A te… Ahumil… Ahum… ""      ""      ""     
##  9 Plate7_G009 p_C3k/… 22047     253 A te… Ahumil… Ahum… ""      ""      ""     
## 10 Plate7_F007 p_C3k/…  7075     217 A te… Ahumil… Ahum… ""      ""      ""     
## # … with 248 more rows, 37 more variables: psba_ID <chr>, mt_forwardQC <chr>,
## #   mt_reverseQC <chr>, mtorf_ID <chr>, Reef <chr>, Site <chr>,
## #   DateCollect <chr>, catBleaching <int>, Depth <dbl>, DHW <chr>, Lat <dbl>,
## #   Long <dbl>, Exposure <chr>, Aspect <chr>, Sector <chr>, maxDHW <dbl>,
## #   meanDHW <dbl>, recent.maxDHW <dbl>, recent.meanDHW <dbl>, DHW2 <int>,
## #   DHW3 <int>, DHW4 <int>, DHW6 <int>, DHW8 <int>, DHW9 <int>,
## #   returnDHW3 <dbl>, returnDHW4 <dbl>, returnDHW6 <dbl>, meanSST <dbl>, …
258 / humil_total
## [1] 0.9923077
#C3k/C3 is majority sequence in 0.9923077 of all samples 

Table S1.

#How many samples collected from each reef, for each type of coral species? 
meta %>% dplyr::select(c(Vial, Reef, mtORF)) %>% 
  filter(mtORF != "Unknown") %>% 
  group_by(Reef, mtORF) %>% 
  summarise(total_samples = n()) %>%
  pivot_wider(names_from = mtORF, values_from = total_samples)
## # A tibble: 13 × 5
## # Groups:   Reef [13]
##    Reef         Ahumilis Haplotype8a Pmeandrina Pverrucosa
##    <chr>           <int>       <int>      <int>      <int>
##  1 Bougainville       22           6         10         18
##  2 Chilcott           18           2         10          3
##  3 Flinders           30           2          5         26
##  4 Frederick          17           2         13          2
##  5 Herald             19           3          5         11
##  6 Holmes             30           2          7         20
##  7 Lihou              32           1          6         18
##  8 Marion             36           6         11         13
##  9 Moore              20           3         19         10
## 10 Osprey             14          NA          4         11
## 11 Saumarez           NA           5         22         NA
## 12 Willis             NA           1          9         14
## 13 Wreck              24           6         13          6

3.2 UPGMA Stats

seq_data <- clean_data %>% 
  filter(!str_detect(name, "non")) %>% 
  filter(!str_detect(name, "p_"))
  
#creating an object for each species 
poci_seqs <- seq_data %>% filter(coral_genus == "Pocillopora")
pver_seqs <- seq_data %>% filter(str_detect(mtORF, "Pverrucosa"))
pmh8_seqs <- seq_data %>% filter(str_detect(mtORF, "Pmeandrina|Haplotype8a"))
acro_seqs <- seq_data %>% filter(str_detect(mtORF, "Ahumilis"))

Fig2a

#read in file 
fasta_poci <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
  filter(label %in% poci_seqs$name) %>%   #only keeping DNA seqs that appear in seqs_long subset 
  filter(!str_detect(label, "A|G")) %>%
  deframe() %>%
  as_dna()

#creating the tree
kdist_poci <- fasta_poci %>%
  dna_to_DNAbin() %>%
  kdistance(k = 7, residues = "DNA", method = "edgar") %>%
  as.matrix()

k_tree_poci <- kdist_poci %>% phangorn::upgma()

seqs_wide_poci <- poci_seqs %>%
  dplyr::select(sample_name, name, value) %>%
  filter(!str_detect(name, "A|G")) %>%
  pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
  tibble::column_to_rownames(var = "sample_name")

k_unidist_poci <- GUniFrac(seqs_wide_poci, k_tree_poci)   #GUniFrac calculates all the distances 
k_unidist_poci <- k_unidist_poci$unifracs

du_poci <- k_unidist_poci[, , "d_0.5"]    # GUniFrac with alpha 0.5 
dist_poci <- as.dist(du_poci, diag = FALSE)

# Cluster the samples
hclust_samps_poci <- upgma(du_poci)

# Make the sample tree
tree_poci <- ggtree(hclust_samps_poci, size = 0.2) +
  theme(aspect.ratio = 0.3)

# Get a sample order from ggtree
poci_sample_order <- tree_poci$data %>% filter(isTip == "TRUE") %>%
  arrange(y) %>%
  pull(label)

pocimat <- meta %>% filter(sample_name %in% tree_poci$data$label) %>%
  select(sample_name, RFLP = Species, mtORF) %>%
  mutate(RFLP = case_when(RFLP == "Pmeandrina" ~ "P. meandrina",
                          RFLP == "Pverrucosa" ~ "P. verrucosa", TRUE ~ RFLP),
         mtORF = case_when(mtORF == "Pmeandrina" ~ "P. meandrina",
                           mtORF == "Pverrucosa" ~ "P. verrucosa", TRUE ~ mtORF)) %>%
  tibble::column_to_rownames(var = "sample_name")

poci_tree_mat <- gheatmap(tree_poci, pocimat, colnames = T, width=.1, offset = -0.26)
poci_tree_mat <- poci_tree_mat + scale_fill_d3(palette = "category10", na.value = "grey90") + layout_dendrogram() + theme(legend.position = "left")

# Start plotting the composition data
plot_df_poci <- clean_data %>%
  filter(coral_genus == "Pocillopora") %>%
  mutate(sample_name = fct_relevel(sample_name, poci_sample_order))

theme_set(theme_bw())

# find the likely distinguishing seqs in here
test_df <- pmh8_seqs %>%
   group_by(name) %>%
   summarise(mean = mean(value_rel), n = n()) %>%
   arrange(desc(n), desc(mean))

# colour them black to check
 test_pal <- all_pal
 test_pal['C42g'] <- "#000000" 

bar_uni_poci <- 
ggplot(plot_df_poci, aes(sample_name, value_rel)) +
geom_bar(stat = "identity", aes(fill = name, colour = name)) +
theme(aspect.ratio = 0.5, legend.position = "none", axis.text.y=element_blank(), axis.ticks.y = element_blank(),
      axis.text.x=element_blank(), axis.ticks.x = element_blank(),
      axis.title.x = element_blank(), axis.title.y = element_blank(),
      panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.ticks = element_blank()) +
scale_fill_manual(values = all_pal, breaks = levels(profile_data$name)) +
scale_colour_manual(values = test_pal, breaks = levels(profile_data$name)) +
geom_hline(yintercept = 1, size = 1) +
guides(fill=guide_legend(ncol=2))

bar_uni_poci

#p_bar_uni is the sequences by colour. P_tree_tip is the tree coloured by reef.

poci_tree_mat / bar_uni_poci

Fig 2B Acropora Symportal data

OBSERVATIONS: Regardless of the tree-type and unifrac combination, there is one major branch point in the tree. While this does not seem to be consistent with lat/lon, the unweighted approach seems very good at placing samples from the same lat/lons together.

#read in file 
fasta_acro <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
  filter(label %in% acro_seqs$name) %>%   #only keeping DNA seqs that appear in seqs_long subset 
  filter(str_sub(label, 1, 1) == "C" | str_detect(label, "_C")) %>%
  deframe() %>%
  as_dna()

# Unifracs With a kmer-based tree

kdist_acro <- fasta_acro %>%
  dna_to_DNAbin() %>%
  kdistance(k = 7, residues = "DNA", method = "edgar") %>%
  as.matrix()

k_tree_acro <- kdist_acro %>% phangorn::upgma()

# Get the community matrix
seqs_wide_acro <- acro_seqs %>%
  dplyr::select(sample_name, name, value) %>%
  filter(str_sub(name, 1, 1) == "C" | str_detect(name, "_C")) %>%
  pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
  tibble::column_to_rownames(var = "sample_name")

# comput unifrac distances  
k_unidist_acro <- GUniFrac(seqs_wide_acro, k_tree_acro)   #GUniFrac calculates all the distances 
k_unidist_acro <- k_unidist_acro$unifracs

du_acro <- k_unidist_acro[, , "d_0.5"]    # GUniFrac with alpha 0.5
dist_acro <- as.dist(du_acro, diag = FALSE)


# Cluster the samples
hclust_samps_acro <- upgma(du_acro)

# Make the sample tree
tree_acro <- ggtree(hclust_samps_acro, size = 0.2) +
  theme(aspect.ratio = 0.3) + layout_dendrogram()

# Get a sample order from ggtree
acro_sample_order <- tree_acro$data %>% filter(isTip == "TRUE") %>%
  arrange(y) %>%
  pull(label)

# Start plotting the composition data
plot_df_acro <- clean_data %>%
  filter(str_detect(Species, "Ahumilis")) %>%
  mutate(sample_name = fct_relevel(sample_name, acro_sample_order))

theme_set(theme_bw())


# find the likely distinguishing seqs in here
test_df <- acro_seqs %>%
   group_by(name) %>%
   summarise(mean = mean(value_rel), n = n()) %>%
   arrange(desc(n), desc(mean))

# colour them black to check
 test_pal <- all_pal
 test_pal['C1c'] <- "#000000" 

bar_uni_acro <- 
ggplot(plot_df_acro, aes(sample_name, value_rel)) +
geom_bar(stat = "identity", aes(fill = name, colour = name)) +
theme(aspect.ratio = 0.5, legend.position = "none", axis.text.y=element_blank(), axis.ticks.y = element_blank(),
      axis.text.x=element_blank(), axis.ticks.x = element_blank(),
      axis.title.x = element_blank(), axis.title.y = element_blank(),
      panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.ticks = element_blank()) +
scale_fill_manual(values = test_pal, breaks = levels(profile_data$name)) +
scale_colour_manual(values = test_pal, breaks = levels(profile_data$name)) +
geom_hline(yintercept = 1, size = 1) +
guides(fill=guide_legend(ncol=2))

#p_bar_uni is the sequences by colour. P_tree_tip is the tree coloured by reef. 

tree_acro / bar_uni_acro

Pver distance matrix

fasta_pver <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
  filter(label %in% pver_seqs$name) %>%   #only keeping DNA seqs that appear in seqs_long subset 
  filter(!str_detect(label, "A|G")) %>%
  deframe() %>%
  as_dna()

#creating the tree
kdist_pver <- fasta_pver %>%
  dna_to_DNAbin() %>%
  kdistance(k = 7, residues = "DNA", method = "edgar") %>%
  as.matrix()

k_tree_pver <- kdist_pver %>% phangorn::upgma()

seqs_wide_pver <- pver_seqs %>%
  dplyr::select(sample_name, name, value) %>%
  filter(!str_detect(name, "A|G")) %>%
  pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
  tibble::column_to_rownames(var = "sample_name")

k_unidist_pver <- GUniFrac(seqs_wide_pver, k_tree_pver)   #GUniFrac calculates all the distances 
k_unidist_pver <- k_unidist_pver$unifracs

du_pver <- k_unidist_pver[, , "d_0.5"]    # GUniFrac with alpha 0.5 
dist_pver <- as.dist(du_pver, diag = FALSE)

Pmean/hap8a dist

fasta_pmh8 <- read_fasta_df("20210612_marzonie/186_20211115_03_DBV_20211116T024440.seqs.fasta") %>%
  filter(label %in% pmh8_seqs$name) %>%   #only keeping DNA seqs that appear in seqs_long subset 
  filter(!str_detect(label, "A|G")) %>%
  deframe() %>%
  as_dna()

#creating the tree
kdist_pmh8 <- fasta_pmh8 %>%
  dna_to_DNAbin() %>%
  kdistance(k = 7, residues = "DNA", method = "edgar") %>%
  as.matrix()

k_tree_pmh8 <- kdist_pmh8 %>% phangorn::upgma()

seqs_wide_pmh8 <- pmh8_seqs %>%
  dplyr::select(sample_name, name, value) %>%
  filter(!str_detect(name, "A|G")) %>%
  pivot_wider(names_from = name, values_from = value, values_fill = 0) %>%
  tibble::column_to_rownames(var = "sample_name")

k_unidist_pmh8 <- GUniFrac(seqs_wide_pmh8, k_tree_pmh8)   # GUniFrac calculates all the distances 
k_unidist_pmh8 <- k_unidist_pmh8$unifracs

du_pmh8 <- k_unidist_pmh8[, , "d_0.5"]    # GUniFrac with alpha 0.5 
dist_pmh8 <- as.dist(du_pmh8, diag = FALSE)

psbAncr

#psba txt files forward and reverse
f_all <- sort(list.files(path = "psba txt files/", pattern = "Forw.txt", full.names = TRUE))

r_all <- sort(list.files(path = "psba txt files/", pattern = "Rev.txt", full.names = TRUE))


f_all_fasta <- f_all %>%
  map(read_fasta_df) %>% # read in all the files
  purrr::reduce(rbind) # reduce with rbind into one dataframe

r_all_fasta <- r_all %>%
  map(read_fasta_df) %>% # read in all the files
  purrr::reduce(rbind) # reduce with rbind into one dataframe
# apply the trimming parameters to start and end of the sequences to improve consensus

f_all_fasta_sub <- f_all_fasta %>%
  mutate(sequence = case_when(str_detect(label, "psba1_A12") ~ str_sub(sequence, start = 30, end = 550),
                              str_detect(label, "psba1_B02") ~ str_sub(sequence, start = 30, end = 700),
                              str_detect(label, "psba1_B07") ~ str_sub(sequence, start = 30, end = 600),
                              str_detect(label, "psba1_B09") ~ str_sub(sequence, start = 30, end = 650),
                              str_detect(label, "psba1_B09") ~ str_sub(sequence, start = 30, end = 600),
                              str_detect(label, "psba1_B10") ~ str_sub(sequence, start = 30, end = 650),
                              str_detect(label, "psba1_C12") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_D10") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_D11") ~ str_sub(sequence, start = 30, end = 545),
           TRUE ~ str_sub(sequence, start = 30, end = 800)))

r_all_fasta_sub <- r_all_fasta %>%
  mutate(sequence = case_when(str_detect(label, "psba1_A04") ~ str_sub(sequence, start = 30, end = 450),
                              str_detect(label, "psba1_A06") ~ str_sub(sequence, start = 30, end = 400),
                              str_detect(label, "psba1_A07") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_A08") ~ str_sub(sequence, start = 30, end = 450),
                              str_detect(label, "psba1_A08") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_A09") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_A12") ~ str_sub(sequence, start = 30, end = 550),
                              str_detect(label, "psba1_B02") ~ str_sub(sequence, start = 30, end = 300),
                              str_detect(label, "psba1_B03") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_B06") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_B09") ~ str_sub(sequence, start = 30, end = 450),
                              str_detect(label, "psba1_B10") ~ str_sub(sequence, start = 30, end = 450),
                              str_detect(label, "psba1_B11") ~ str_sub(sequence, start = 30, end = 450),
                              str_detect(label, "psba1_C04") ~ str_sub(sequence, start = 30, end = 225),
                              str_detect(label, "psba1_D05") ~ str_sub(sequence, start = 30, end = 300),
                              str_detect(label, "psba1_D11") ~ str_sub(sequence, start = 30, end = 425),
                              str_detect(label, "psba1_E03") ~ str_sub(sequence, start = 30, end = 550),
                              str_detect(label, "psba1_F02") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_F08") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_G01") ~ str_sub(sequence, start = 30, end = 300),
                              str_detect(label, "psba1_H01") ~ str_sub(sequence, start = 30, end = 550),
                              str_detect(label, "psba1_H02") ~ str_sub(sequence, start = 30, end = 500),
                              str_detect(label, "psba1_H10") ~ str_sub(sequence, start = 30, end = 450),
                              TRUE ~ str_sub(sequence, start = 30, end = 600)))

# convert to format for DECIPHER and reverse complement the reverse sequence

f_dss <- f_all_fasta_sub %>%
  deframe() %>%
  as_dna() %>%
  dna_to_DNAStringset()

r_dss <- r_all_fasta_sub %>%
  deframe() %>%
  as_dna() %>%
  dna_to_DNAStringset() %>%
  reverseComplement()

pair_list <- list()
for(i in 1:length(f_dss)){
  ss <- DNAStringSet(c(f_dss[i], r_dss[i]))
  pair_list[[i]] <- ss
}

# Align the sample pairs
alignment_list <- list()
for(i in 1:length(pair_list)){
  alignment <- AlignSeqs(pair_list[[i]], verbose = FALSE)
  alignment_list[[i]] <- alignment
}

# View the alignments
aligned_df <- data.frame()
for(i in 1:length(alignment_list)){
  a_pair <- alignment_list[[i]] %>% writeXStringSet("temp_file.fasta")
  a_df <- read_fasta_df("temp_file.fasta")
  aligned_df <- rbind(aligned_df, a_df)
}

aligned_plotting <- aligned_df %>%
  mutate(sample_id = str_sub(label, 23, 25)) %>% # create a metadata column that identifies the sample
  mutate(label = str_sub(label, 23, 31))
  
# Create profile key
key <- aligned_plotting %>%
  tibble::rownames_to_column(var = "id")

# Create long dataframe for ggplot
long_sequences <- str_split(aligned_plotting$sequence, "") %>%
  reshape2::melt() %>%
  group_by(L1) %>%
  mutate(x = row_number(),
         L1 = as.character(L1)) %>%
  left_join(., key, by = c("L1" = "id")) %>%
  ungroup()

# Plot alignment
ggplot(long_sequences, aes(y = label, x = x)) +
      geom_tile(aes(fill = value), size = 1, name = "base") +
      facet_wrap(~ sample_id, nrow = 12, scales = "free_y") +
      scale_fill_manual(values = palette) +
      theme(aspect.ratio = 0.3,
            axis.title.y = element_blank()) +
      scale_x_continuous(expand = c(0, 0)) +
    xlab("Position")

# Sample list that we may want to exclude based on f and r alignments: B12, C04, G01, D06, D11
exclude_list <- c("B12", "D06", "C04", "F02", "B02", "B03")

align_list_final <- list()
for(i in 1:length(alignment_list)){
  if(str_sub(names(alignment_list[[i]][1]), 23, 25) %in% exclude_list) align_list_final[[i]] <- NULL else align_list_final[[i]] <- alignment_list[[i]]
}

align_list_final <- Filter(Negate(is.null), align_list_final)

Create the consensus sequences

consensus_df <- list()
for(i in 1:length(align_list_final)){
  a_con <- align_list_final[[i]] %>% ConsensusSequence()
  names(a_con) <- names(align_list_final[[i]])[1]
  consensus_df[[i]] <- a_con
}

Pocillopora

pocillo_meta <- meta %>%
  filter(mtORF == "Pmeandrina" | mtORF == "Pverrucosa" | mtORF == "Haplotype8a") %>%
  filter(str_detect(psba_ID, "psb"))

pocillo_names <- pocillo_meta %>% pull(psba_ID)

pocillo_cons <- list()
for(i in 1:length(consensus_df)){
  if(str_sub(names(consensus_df[[i]]), 17, 25) %in% pocillo_names) pocillo_cons[[i]] <- consensus_df[[i]] else pocillo_cons[[i]] <- NULL
}

# Keep only the pocillo psba sequences
pocillo_cons <- Filter(Negate(is.null), pocillo_cons)

Generate the ITS2 and psbA dist mats

# Get consensus seqs
pocillo_df <- data.frame()
for(i in 1:length(pocillo_cons)){
  pocillo_cons[[i]] %>% writeXStringSet("temp_file.fasta")
  t_df <- read_fasta_df("temp_file.fasta")
  pocillo_df <- rbind(pocillo_df, t_df)
}

# Kmer based distance matrix
dis_psbaID <- pocillo_df %>%
  mutate(psba_ID = str_sub(label, 17, 25)) %>%
  left_join(., pocillo_meta) %>%
  select(sample_name, sequence) %>%
  deframe() %>%
  DNAStringSet() %>%
  DNAStringSet_to_DNAbin() %>%
  kmer::kdistance(k = 7) %>%
  as.matrix()

# Produce hierarchical tree
psba_upgma_tree <- phangorn::upgma(dis_psbaID)

# Subset the full matrix
du_poci_sub <- dist_subset(du_poci, psba_upgma_tree$tip.label) %>%
  as.matrix()

# Produce hierarchical tree
ITS2_upgma_tree <- phangorn::upgma(du_poci_sub)

Pocillopora ITS2 Tanglegram

untang <- untangle(dend1 = ITS2_upgma_tree %>% as.dendrogram(), dend2 = psba_upgma_tree %>% as.dendrogram, method = "step2side")
tgram <- tanglegram(untang, margin_inner = 8)

entanglement(tgram) # 0.01193893
## [1] 0.01193893
t_order <- labels(tgram$dend1)
s_order <- labels(tgram$dend2)

tt <- as.phylo(tgram$dend1)
tt <- midpoint(tt)

st <- as.phylo(tgram$dend2)
st <- midpoint(st)

tree_meta <- pocillo_meta %>%
  filter(sample_name %in% psba_upgma_tree$tip.label) %>%
  mutate(new_lab = paste0(sample_name, "_", mtORF))

gtt <- ggtree(tt, ladderize = FALSE) 
gtt$data <- gtt$data %>% left_join(., tree_meta, by = c("label" = "sample_name"))

gtt <- gtt + geom_tiplab(size = 3, aes(label = label)) + geom_tippoint(aes(fill = mtORF), shape = 21, size = 5) + ggsci::scale_fill_d3(palette = "category20")
gst <- ggtree(st, ladderize = FALSE)

dtt <- gtt$data
dst <- gst$data

dtt$tree <- "ITS2"
dst$tree <- "psbA"

dst$x <- max(dst$x) - dst$x + max(dtt$x) + max(dtt$x) * 1

pp <- gtt + geom_tree(data = dst)

dd <- bind_rows(dtt, dst) %>%
  filter(isTip == TRUE)

dd1 <- as.data.frame(dd)

p_ip <- pp + geom_line(aes(x, y, group = label), size = 1.2, data = dd1) +
  ggsci::scale_colour_d3(palette = "category20") +
  theme(legend.position = "none")

p_ip

Acropora

acro_meta <- meta %>%
  filter(mtORF == "Ahumilis") %>%
  filter(str_detect(psba_ID, "psb"))

acro_names <- acro_meta %>% pull(psba_ID)

acro_cons <- list()
for(i in 1:length(consensus_df)){
  if(str_sub(names(consensus_df[[i]]), 17, 25) %in% acro_names) acro_cons[[i]] <- consensus_df[[i]] else acro_cons[[i]] <- NULL
}

# Keep only the pocillo psba sequences
acro_cons <- Filter(Negate(is.null), acro_cons)

Generate the ITS2 and psbA dist mats

# Get consensus seqs
acro_df <- data.frame()
for(i in 1:length(acro_cons)){
  acro_cons[[i]] %>% writeXStringSet("temp_file.fasta")
  t_df <- read_fasta_df("temp_file.fasta")
  acro_df <- rbind(acro_df, t_df)
}

# Kmer based distance matrix
dis_psbaID <- acro_df %>%
  mutate(psba_ID = str_sub(label, 17, 25)) %>%
  left_join(., acro_meta) %>%
  select(sample_name, sequence) %>%
  deframe() %>%
  DNAStringSet() %>%
  DNAStringSet_to_DNAbin() %>%
  kmer::kdistance(k = 7) %>%
  as.matrix()

# Produce hierarchical tree
psba_upgma_tree <- phangorn::upgma(dis_psbaID)

# Subset the full matrix
du_acro_sub <- dist_subset(du_acro, psba_upgma_tree$tip.label) %>%
  as.matrix()

# Produce hierarchical tree
ITS2_upgma_tree <- phangorn::upgma(du_acro_sub)

Acropora ITS2 Tanglegram

untang <- untangle(dend1 = ITS2_upgma_tree %>% as.dendrogram(), dend2 = psba_upgma_tree %>% as.dendrogram, method = "step2side")
tgram <- tanglegram(untang, margin_inner = 8)
entanglement(tgram) # 0.1986148
## [1] 0.1986148
t_order <- labels(tgram$dend1)
s_order <- labels(tgram$dend2)

tt <- as.phylo(tgram$dend1)
tt <- midpoint(tt)

st <- as.phylo(tgram$dend2)
st <- midpoint(st)

tree_meta <- acro_meta %>%
  filter(sample_name %in% psba_upgma_tree$tip.label) %>%
  mutate(new_lab = paste0(sample_name, "_", mtORF))

gtt <- ggtree(tt, ladderize = FALSE) 
gtt$data <- gtt$data %>% left_join(., tree_meta, by = c("label" = "sample_name"))

gtt <- gtt + geom_tiplab(size = 3, aes(label = label))
gst <- ggtree(st, ladderize = FALSE)

dtt <- gtt$data
dst <- gst$data

dtt$tree <- "ITS2"
dst$tree <- "psbA"

dst$x <- max(dst$x) - dst$x + max(dtt$x) + max(dtt$x) * 1

pp <- gtt + geom_tree(data = dst)

dd <- bind_rows(dtt, dst) %>%
  filter(isTip == TRUE)

dd1 <- as.data.frame(dd)

p_ip <- pp + geom_line(aes(x, y, group = label), size = 1.2, data = dd1)

tgram <- tanglegram(untang, margin_inner = 8)

p_ip

4.1 PCoA

Here we are running a PCoA on UniFrac distances for the three communities associated with three different host groups (Pocillopora verrucosa, P. meandrina, and Acropora humilis) We are looking for patterns by reef here

Pver PCoA

reef_order <- meta %>% distinct(Reef, Lat) %>%
  group_by(Reef) %>%
  summarise(Lat = mean(Lat)) %>%
  arrange(Lat) %>%
  pull(Reef)

pcoa_pver <- cmdscale(dist_pver, eig = TRUE) #this is doing 'cmds' or classic multidimensional scaling

ordiplot(pcoa_pver, display = 'sites', type = 'text')

barplot(pcoa_pver$eig, names = paste ('PCoA', 1:152), las = 3, ylab = 'eigenvalues')

MDSxy.pver <- data.frame(pcoa_pver$points) %>% 
  rownames_to_column(var = "sample_name") %>% 
  left_join(., meta)

pverPCA <- MDSxy.pver %>%
  mutate(Reef = fct_relevel(Reef, reef_order)) %>%
  ggplot(aes(X1, X2, fill = Reef)) + 
  geom_point(alpha = 1, shape = 21, size = 3) + 
  scale_fill_viridis_d(option = "magma", direction = -1)


pverPCA

Pmea PCoA

pcoa_pmh8 <- cmdscale(dist_pmh8, eig = TRUE)    #this is doing 'cmds' or classic multidimensional scaling

ordiplot(pcoa_pmh8, display = 'sites', type = 'text')

barplot(pcoa_pmh8$eig, names = paste ('PCoA', 1:172), las = 3, ylab = 'eigenvalues')

MDSxy.pmh8 <- data.frame(pcoa_pmh8$points) %>% 
  rownames_to_column(var = "sample_name") %>% 
  left_join(., meta)

pmh8PCA <- MDSxy.pmh8 %>%
  mutate(Reef = fct_relevel(Reef, reef_order)) %>%
  ggplot(aes(X1, X2, fill = Reef)) + 
  geom_point(alpha = 1, shape = 21, size = 3) + 
  scale_fill_viridis_d(option = "magma", direction = -1)

pmh8PCA

Ahum PCoA

pcoa_acro <- cmdscale(dist_acro, eig = TRUE)    #this is doing 'cmds' or classic multidimensional scaling

ordiplot(pcoa_acro, display = 'sites', type = 'text')

barplot (pcoa_acro$eig, names = paste ('PCoA', 1:260), las = 3, ylab = 'eigenvalues')

MDSxy.acro <- data.frame(pcoa_acro$points) %>% 
  rownames_to_column(var = "sample_name") %>% 
  left_join(., meta)

acroPCA <- MDSxy.acro %>%
  mutate(Reef = fct_relevel(Reef, reef_order)) %>%
  ggplot(aes(X1, X2, fill = Reef)) + 
  geom_point(alpha = 1, shape = 21, size = 3) + 
  #ggsci::scale_fill_d3(palette = "category20") #+
  scale_fill_viridis_d(option = "magma", direction = -1)
 # scale_fill_manual(values = reef_pal) #+
 #theme(legend.position = "none", aspect.ratio = 1, text = element_text(size = 15))

acroPCA

pverPCA + pmh8PCA + acroPCA

Fig SX. PCoA of each species symbiont communities

4.2 dbRDA

Pver dbRDA

#importing unifrac distances for analysis 
meta_pver <- meta %>%
  filter(mtORF == "Pverrucosa") %>%
  filter(sample_name %in% keepers_ss$sample_name) %>%
  mutate(catBleaching = as.numeric(catBleaching)) %>%
  tibble::column_to_rownames(var = "sample_name") %>%
  mutate(sample_name = rownames(.))

dist_pver_ordered <- dist_subset(dist_pver, rownames(meta_pver))


# Make a correlation matrix of numeric variables
cm <- cor(meta_pver %>% select(Depth, maxDHW, meanDHW, recent.maxDHW, 
                                      recent.meanDHW, DHW3, DHW4, DHW6, DHW8, 
                                      DHW9, rangeSST, varSST, MMM, Lat))

corrplot(cm) # Clearly a lot of the DHW terms are positively related and approaching redundant

cm %>% cor()
##                      Depth     maxDHW    meanDHW recent.maxDHW recent.meanDHW
## Depth           1.00000000 -0.3747247 -0.1815539   -0.29551269      0.1602795
## maxDHW         -0.37472467  1.0000000  0.5774847    0.97359709      0.1838974
## meanDHW        -0.18155391  0.5774847  1.0000000    0.65767097      0.7855995
## recent.maxDHW  -0.29551269  0.9735971  0.6576710    1.00000000      0.3590438
## recent.meanDHW  0.16027947  0.1838974  0.7855995    0.35904376      1.0000000
## DHW3           -0.24926416  0.5463224  0.8962680    0.59328239      0.5323811
## DHW4           -0.36324104  0.8192436  0.8026414    0.80591540      0.3158608
## DHW6           -0.02146289  0.4397345  0.9134538    0.55363040      0.8476837
## DHW8           -0.09437930  0.3798900  0.9489017    0.49383156      0.9000039
## DHW9           -0.07846512  0.1613627  0.8756751    0.26558333      0.8749837
## rangeSST        0.01304759 -0.5078297 -0.8707069   -0.64716282     -0.9279753
## varSST         -0.04530179 -0.4159139 -0.8598618   -0.56570304     -0.9602305
## MMM             0.39542721 -0.4244711  0.2864056   -0.24602109      0.7857945
## Lat            -0.30686699  0.2075915 -0.5125898    0.02496643     -0.9109919
##                       DHW3        DHW4        DHW6       DHW8        DHW9
## Depth          -0.24926416 -0.36324104 -0.02146289 -0.0943793 -0.07846512
## maxDHW          0.54632238  0.81924362  0.43973446  0.3798900  0.16136266
## meanDHW         0.89626804  0.80264144  0.91345377  0.9489017  0.87567507
## recent.maxDHW   0.59328239  0.80591540  0.55363040  0.4938316  0.26558333
## recent.meanDHW  0.53238105  0.31586079  0.84768366  0.9000039  0.87498366
## DHW3            1.00000000  0.85552516  0.73344231  0.7551756  0.68912001
## DHW4            0.85552516  1.00000000  0.69712721  0.6244913  0.45419667
## DHW6            0.73344231  0.69712721  1.00000000  0.9476503  0.84778489
## DHW8            0.75517560  0.62449133  0.94765025  1.0000000  0.94959759
## DHW9            0.68912001  0.45419667  0.84778489  0.9495976  1.00000000
## rangeSST       -0.61740960 -0.56154282 -0.90049909 -0.9211093 -0.80516316
## varSST         -0.59834596 -0.50642870 -0.91139180 -0.9342055 -0.83994732
## MMM             0.03286723 -0.31572098  0.41847542  0.5158353  0.63053196
## Lat            -0.25297629  0.07931285 -0.60053164 -0.7026860 -0.77933963
##                   rangeSST      varSST         MMM         Lat
## Depth           0.01304759 -0.04530179  0.39542721 -0.30686699
## maxDHW         -0.50782966 -0.41591386 -0.42447110  0.20759147
## meanDHW        -0.87070694 -0.85986184  0.28640564 -0.51258976
## recent.maxDHW  -0.64716282 -0.56570304 -0.24602109  0.02496643
## recent.meanDHW -0.92797525 -0.96023048  0.78579452 -0.91099192
## DHW3           -0.61740960 -0.59834596  0.03286723 -0.25297629
## DHW4           -0.56154282 -0.50642870 -0.31572098  0.07931285
## DHW6           -0.90049909 -0.91139180  0.41847542 -0.60053164
## DHW8           -0.92110935 -0.93420546  0.51583531 -0.70268602
## DHW9           -0.80516316 -0.83994732  0.63053196 -0.77933963
## rangeSST        1.00000000  0.99373739 -0.52353597  0.70620277
## varSST          0.99373739  1.00000000 -0.60278844  0.77070312
## MMM            -0.52353597 -0.60278844  1.00000000 -0.96777268
## Lat             0.70620277  0.77070312 -0.96777268  1.00000000
# Check the vif scores of the full model
ord_pver_full <- dbrda(dist_pver_ordered ~ Depth + maxDHW + meanDHW + recent.maxDHW + 
                                      recent.meanDHW + DHW3 + DHW4 + DHW6 + DHW8 + 
                                      DHW9 + rangeSST + varSST + MMM + Lat + catBleaching, data = meta_pver)

sort(vif.cca(ord_pver_full)) 
##   catBleaching          Depth           DHW6           DHW3  recent.maxDHW 
##       1.111768       1.515812       8.142266       9.271732      10.991559 
##           DHW8         maxDHW           DHW9           DHW4        meanDHW 
##      11.098885      15.964914      21.303216      30.863670      58.933962 
##       rangeSST         varSST            Lat recent.meanDHW            MMM 
##      65.268575     134.814585     188.594615     190.309030     419.108422
#many correlated thermal history variables. We can definitely keep: depth, catBleaching. 
#keep lat, remove MMM -> one variable to reflect latitudinal gardient 
#keep varSST, remove rangeSST -> one variable to reflect variation throughout the year
#keep recent.maxDHW -> one variable to reflect recent thermal history (would expect to shape more than long-term)

# Reduce the model (consult the corplot and vif scores)
ord_pver <- dbrda(dist_pver_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW, data = meta_pver)

# Re check the new vif scores
sort(vif.cca(ord_pver))
##  catBleaching         Depth recent.maxDHW           Lat        varSST 
##      1.023056      1.044675      1.480152      1.993197      2.225752
# Use ordistep to further refine the model
os_pver_backward <- ordistep(ord_pver, direction = "backward", permutations = 999)
## 
## Start: dist_pver_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW 
## 
##                 Df    AIC      F Pr(>F)  
## - catBleaching   1 223.80 0.9427  0.459  
## - varSST         1 223.87 1.0134  0.383  
## - Lat            1 224.03 1.1628  0.293  
## - recent.maxDHW  1 224.45 1.5702  0.125  
## - Depth          1 225.09 2.1917  0.028 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: dist_pver_ordered ~ Lat + Depth + varSST + recent.maxDHW 
## 
##                 Df    AIC      F Pr(>F)  
## - varSST         1 222.80 0.9729  0.404  
## - Lat            1 223.16 1.3210  0.202  
## - recent.maxDHW  1 223.32 1.4789  0.156  
## - Depth          1 224.06 2.2064  0.035 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: dist_pver_ordered ~ Lat + Depth + recent.maxDHW 
## 
##                 Df    AIC      F Pr(>F)   
## - recent.maxDHW  1 222.45 1.6085  0.110   
## - Depth          1 223.06 2.2115  0.031 * 
## - Lat            1 223.62 2.7648  0.010 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: dist_pver_ordered ~ Lat + Depth 
## 
##         Df    AIC      F Pr(>F)   
## - Depth  1 222.76 2.2809  0.028 * 
## - Lat    1 223.62 3.1452  0.009 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(os_pver_backward, by = 'margin')
## Permutation test for dbrda under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: dbrda(formula = dist_pver_ordered ~ Lat + Depth, data = meta_pver)
##           Df SumOfSqs      F Pr(>F)  
## Lat        1   0.0883 3.1452  0.012 *
## Depth      1   0.0640 2.2809  0.025 *
## Residual 149   4.1812                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Permutation test for dbrda under NA model
# Marginal effects of terms
# Permutation: free
# Number of permutations: 999
# 
# Model: dbrda(formula = dist_pver ~ Lat, data = meta_pver)
#           Df SumOfSqs      F Pr(>F)  
# Lat        1   0.0494 1.7237  0.086 .
# Residual 150   4.3017                
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


#variance partitioning to see how much variation is explained by each factor 
varp <- varpart(dist_pver_ordered, ~Lat, ~Depth, ~recent.maxDHW, data = meta_pver)
plot(varp, digits = 2, Xnames = c("Lat", "Depth", "recent.maxDHW"), bg = c("navy", "tomato", "orange"))

#extract dbRDA scores
pver_scores <- as.data.frame(scores(os_pver_backward, display = "sites")) %>%
  tibble::rownames_to_column(var = "sample_name") %>%
  left_join(., meta_pver)

#extract dbRDA vectors
pver_vectors <- as.data.frame(os_pver_backward$CCA$biplot) %>%
  tibble::rownames_to_column(var = "factors")

#produce dbRDA plot 
pver_fullrda <- pver_scores %>%
  mutate(Reef = fct_relevel(Reef, reef_order)) %>%
    ggplot(aes(x = dbRDA1, y = dbRDA2)) +
      geom_point(aes(fill = Lat), size = 4, shape = 21) +
      geom_label_repel(data = pver_vectors, aes(x = dbRDA1, y = dbRDA2, label = factors), box.padding = 0.75, size = 3, segment.colour = NA) +
      geom_segment(data = pver_vectors, aes(x = 0, xend = dbRDA1, y = 0, yend = dbRDA2), 
      size = 0.5, arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
      theme(legend.position = "right", aspect.ratio = 1, text = element_text(size = 15)) +
      scale_fill_viridis_c(option = "magma", direction = -1) +
   theme(legend.position = "none", aspect.ratio = 1, text = element_text(size = 15))


pver_fullrda

Pmea Hap8 dbRDA

#importing unifrac distances for analysis 
meta_pmh8 <- meta %>%
  filter(mtORF == "Pmeandrina" | mtORF == "Haplotype8a") %>%
  filter(sample_name %in% keepers_ss$sample_name) %>%
  mutate(catBleaching = as.numeric(catBleaching)) %>%
  tibble::column_to_rownames(var = "sample_name") %>%
  mutate(sample_name = rownames(.))

dist_pmh8_ordered <- dist_subset(dist_pmh8, rownames(meta_pmh8))

# Make a correlation matrix of numeric variables
cm <- cor(meta_pmh8 %>% select(Depth, maxDHW, meanDHW, recent.maxDHW, 
                                recent.meanDHW, DHW3, DHW4, DHW6, DHW8, 
                                DHW9, rangeSST, varSST, MMM, Lat))

corrplot(cm) # Clearly a lot of the DHW terms are positively related and approaching redundant

# Check the vif scores of the full model
ord_pmh8_full <- dbrda(dist_pmh8_ordered ~ Depth + maxDHW + meanDHW + recent.maxDHW + 
                                      recent.meanDHW + DHW3 + DHW4 + DHW6 + DHW8 + 
                                      DHW9 + rangeSST + varSST + MMM + Lat + catBleaching, data = meta_pmh8)
sort(vif.cca(ord_pmh8_full))
##   catBleaching          Depth           DHW4           DHW6           DHW8 
##       1.188304       1.817157       5.924381       6.117623      10.000429 
##           DHW9         maxDHW           DHW3  recent.maxDHW        meanDHW 
##      10.132665      11.382971      12.028073      12.650027      50.092986 
##       rangeSST recent.meanDHW         varSST            Lat            MMM 
##      63.799068     115.231738     152.920843     200.478794     383.137823
#insert vif scores here 

# Reduce the model (consult the corplot and vif scores)
ord_pmh8 <- dbrda(dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW, data = meta_pmh8)

# Re check the new vif scores
sort(vif.cca(ord_pmh8))
##  catBleaching         Depth recent.maxDHW        varSST           Lat 
##      1.121912      1.222941      1.335107      3.509340      4.322619
#insert vif scores here 

# Use ordistep to further refine the model
ord_pmh8_backward <- ordistep(ord_pmh8, direction = "backward", permutations = 999)
## 
## Start: dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW 
## 
##                 Df    AIC       F Pr(>F)    
## - recent.maxDHW  1 258.30  0.7066  0.568    
## - Depth          1 259.28  1.6597  0.156    
## - catBleaching   1 263.93  6.2589  0.002 ** 
## - varSST         1 264.88  7.2119  0.001 ***
## - Lat            1 278.21 21.1719  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST 
## 
##                Df    AIC       F Pr(>F)    
## - Depth         1 258.44  2.0895  0.075 .  
## - catBleaching  1 262.67  6.3031  0.003 ** 
## - varSST        1 263.94  7.5903  0.001 ***
## - Lat           1 281.85 26.7498  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ord_pmh8_backward, by = 'margin')
## Permutation test for dbrda under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: dbrda(formula = dist_pmh8_ordered ~ Lat + Depth + catBleaching + varSST, data = meta_pmh8)
##               Df SumOfSqs       F Pr(>F)    
## Lat            1   0.6825 26.7498  0.001 ***
## Depth          1   0.0533  2.0895  0.070 .  
## catBleaching   1   0.1608  6.3031  0.002 ** 
## varSST         1   0.1936  7.5903  0.001 ***
## Residual     167   4.2607                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#insert model outputs here 

#variance partitioning: how much does each variable explain in the model? residuals? 
varp <- varpart(dist_pmh8_ordered, ~Lat, ~catBleaching, ~varSST, data = meta_pmh8)
plot(varp, digits = 2, Xnames = c("Lat", "catBleaching", "varSST"), bg = c("navy", "tomato", "orange"))

#extract scores from dbRDA 
pmh8_scores <- as.data.frame(scores(ord_pmh8_backward, display = "sites")) %>%
  tibble::rownames_to_column(var = "sample_name") %>%
  left_join(., meta_pmh8)

#extract vectors from dbRDA 
pmh8_vectors <- as.data.frame(ord_pmh8_backward$CCA$biplot) %>%
  tibble::rownames_to_column(var = "factors")

#produce dbRDA plot 
pmh8_fullrda <- pmh8_scores %>%
  mutate(Reef = fct_relevel(Reef, reef_order)) %>%
    ggplot(aes(x = dbRDA1, y = dbRDA2)) +
      geom_point(aes(fill = Lat), size = 4, shape = 21) +
        geom_label_repel(data = pmh8_vectors, aes(x = dbRDA1, y = dbRDA2, label = factors), box.padding = 0.5, size = 3, segment.colour = NA) +
      geom_segment(data = pmh8_vectors, aes(x = 0, xend = dbRDA1, y = 0, yend = dbRDA2), 
      size = 0.5, arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
      theme(aspect.ratio = 1, text = element_text(size = 15)) +
      guides(fill = guide_colorbar (reverse = T)) +
      scale_fill_viridis_c(option = "magma", direction = -1) +
      theme(legend.position = "none", aspect.ratio = 1, text = element_text(size = 15))
pmh8_fullrda

Ahum dbRDA

library(ggrepel)
#identify 7 outlier samples and remove from data set
outlier_samples <- c("Plate6_D012", "Plate6_E009", "Plate6_H011", "Plate6_B012", "Plate7_D006", "Plate6_G009", "Plate7_H009", "Plate7_A010", "Plate6_D011")

#importing unifrac distances for analysis 
meta_acro <- meta %>%
  filter(mtORF == "Ahumilis") %>%
  filter(sample_name %in% keepers_ss$sample_name) %>%
  filter(!(sample_name %in% outlier_samples)) %>%
  mutate(catBleaching = as.factor(catBleaching)) %>%
  tibble::column_to_rownames(var = "sample_name") %>%
  mutate(sample_name = rownames(.))

dist_acro_ordered <- dist_subset(dist_acro, rownames(meta_acro))

# Make a correlation matrix of numeric variables
cm <- cor(meta_acro %>% select(Depth, maxDHW, meanDHW, recent.maxDHW, 
                                recent.meanDHW, DHW3, DHW4, DHW6, DHW8, 
                                DHW9, rangeSST, varSST, MMM, Lat))

corrplot(cm) # Clearly a lot of the DHW terms are positively related and approaching redundant

# Check the vif scores of the full model
ord_acro_full <- dbrda(dist_acro_ordered ~ Depth + maxDHW + meanDHW + recent.maxDHW + 
                                      recent.meanDHW + DHW3 + DHW4 + DHW6 + DHW8 + 
                                      DHW9 + rangeSST + varSST + MMM + Lat + catBleaching, data = meta_acro)
sort(vif.cca(ord_acro_full))
##  catBleaching6          Depth  catBleaching2  catBleaching3           DHW6 
##       1.435074       1.616944       2.773975       4.290302       5.117525 
##  catBleaching4  catBleaching5           DHW8           DHW3           DHW4 
##       5.440155       5.573908       6.273625       7.617778       8.934821 
##         maxDHW           DHW9  recent.maxDHW        meanDHW recent.meanDHW 
##      15.324583      15.526488      16.970046      33.230620     107.130974 
##       rangeSST         varSST            Lat            MMM 
##     164.878761     187.998043     795.502964     864.078437
#insert vif scores here 

# Reduce the model (consult the corplot and vif scores)
ord_acro <- dbrda(dist_acro_ordered ~  Lat + Depth + catBleaching + varSST + recent.maxDHW, data = meta_acro)

# Re check the new vif scores
sort(vif.cca(ord_acro))
##         Depth catBleaching6 recent.maxDHW        varSST catBleaching2 
##      1.178178      1.295819      1.728728      1.857405      2.494004 
##           Lat catBleaching3 catBleaching5 catBleaching4 
##      2.835694      3.412909      4.107573      4.268968
#insert vif scores here 

# Use ordistep to further refine the model
ord_acro_backward <- ordistep(ord_acro, direction = "backward", permutations = 999)
## 
## Start: dist_acro_ordered ~ Lat + Depth + catBleaching + varSST + recent.maxDHW 
## 
##                 Df    AIC       F Pr(>F)    
## - catBleaching   5 569.62  1.2385  0.212    
## - Depth          1 573.41  2.0894  0.063 .  
## - varSST         1 575.34  3.9582  0.008 ** 
## - Lat            1 578.53  7.0967  0.001 ***
## - recent.maxDHW  1 591.62 20.3815  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: dist_acro_ordered ~ Lat + Depth + varSST + recent.maxDHW 
## 
##                 Df    AIC       F Pr(>F)    
## - Depth          1 569.80  2.1485  0.073 .  
## - varSST         1 571.56  3.8977  0.007 ** 
## - Lat            1 575.42  7.7651  0.001 ***
## - recent.maxDHW  1 587.07 19.8207  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ord_acro_backward, by = 'margin')
## Permutation test for dbrda under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: dbrda(formula = dist_acro_ordered ~ Lat + Depth + varSST + recent.maxDHW, data = meta_acro)
##                Df SumOfSqs       F Pr(>F)    
## Lat             1   0.2946  7.7651  0.001 ***
## Depth           1   0.0815  2.1485  0.079 .  
## varSST          1   0.1479  3.8977  0.007 ** 
## recent.maxDHW   1   0.7520 19.8207  0.001 ***
## Residual      246   9.3328                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#insert model outputs here 

#variance partitioning: how much does each variable explain in the model? residuals? 
varp <- varpart(dist_acro_ordered, ~Lat, ~varSST, ~recent.maxDHW, data = meta_acro)
plot(varp, digits = 2, Xnames = c("Lat", "varSST", "recent.maxDHW"), bg = c("navy", "tomato", "orange"))

#extract scores from dbRDA 
acro_scores <- as.data.frame(scores(ord_acro_backward, display = "sites")) %>%
  tibble::rownames_to_column(var = "sample_name") %>%
  left_join(., meta_acro)

#extract vectors from dbRDA 
acro_vectors <- as.data.frame(ord_acro_backward$CCA$biplot) %>%
  tibble::rownames_to_column(var = "factors")

#produce dbRDA plot 
acro_fullrda <- acro_scores %>%
  mutate(Reef = fct_relevel(Reef, reef_order)) %>%
    ggplot(aes(x = dbRDA1, y = dbRDA2)) +
      geom_point(aes(fill = Lat), size = 4, shape = 21) +
      geom_label_repel(data = acro_vectors, aes(x = dbRDA1, y = dbRDA2, label = factors), box.padding = 1, size = 3, segment.colour = NA) +
      geom_segment(data = acro_vectors, aes(x = 0, xend = dbRDA1, y = 0, yend = dbRDA2), size = 0.5, arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
      theme(aspect.ratio = 1, text = element_text(size = 15)) +
  guides(fill = guide_colorbar (reverse = T)) +
scale_fill_viridis_c(option = "magma", direction = -1)
  #scale_fill_d3(palette = "category20")

acro_fullrda

Fig 4 dbRDA

dbrda <- pver_fullrda + pmh8_fullrda + acro_fullrda
dbrda

5.1 Network analysis

library(ggraph)
library(igraph)
library(tidyverse)
library(tidygraph)
library(assertthat)
library(purrr)
library(ggplot2)
library(dplyr)

Pver Network

#list of sequences per reef. 
pver_seqreef <- pver_seqs %>% 
distinct(name, Reef)

#removing sequences that are abundant across 5 or more reefs, as these sequences will not be 'diagnostic' if present across many reefs 
remove_list <- pver_seqreef %>% 
  filter(!str_detect(name, "A|G")) %>%
  dplyr::count(name) %>% 
  filter(n >= 11) %>% 
  mutate(name = as.character(name)) %>% 
  pull(name)

#within each reef, get the sum of sequences across all samples collected. 
pver_reefseqs = pver_seqs %>%  
 filter(!(name %in% remove_list)) %>%
  select(name, Reef, sample_name, value ) %>%
  group_by(Reef, name) %>%
  summarise(value = sum(value))  %>%
  select(from = name, to = Reef, value)

pver_network <- pver_reefseqs %>% 
  as_tbl_graph() %>% # convert to tidygraph format
   activate(nodes) %>%
   mutate(attribute = case_when(str_detect(name, "Bougainville|Chilcott|Flinders|Frederick|Herald|Holmes|Lihou|Marion|Moore|Osprey|Saumarez|Willis|Wreck") ~ "Reef", 
                                TRUE ~ "Sequence"), # add node attribute info
          text_label = case_when(attribute == "Reef" ~ name, TRUE ~ ""),
          centrality = centrality_degree()) %>% # can measure which nodes/reefs are 'central'
  ggraph(layout = "fr") +
   geom_edge_link(aes(width = value, alpha = value)) +
   scale_edge_width(range = c(0.2, 2.5)) + # control size
   geom_node_point(aes(shape = attribute, fill = centrality), size = 4) +
  geom_node_text(aes(label = text_label), repel = TRUE) +
   scale_shape_manual(values = c(21, 22)) +
  scale_fill_viridis_c(option = "magma")

pver_network

#results <- pver_network$data

Pmea Network

#list of sequences per reef. 
pmh8_seqreef <- pmh8_seqs %>% 
distinct(name, Reef)

#removing sequences that are abundant across 5 or more reefs, as these sequences will not be 'diagnostic' if present across many reefs 
remove_list <- pmh8_seqreef %>% 
  filter(!str_detect(name, "A|G")) %>%
  dplyr::count(name) %>% 
  filter(n >= 12) %>% 
  mutate(name = as.character(name)) %>% 
  pull(name)

#within each reef, get the sum of sequences across all samples collected. 
pmh8_reefseqs = pmh8_seqs %>%  
 filter(!(name %in% remove_list)) %>%
  select(name, Reef, sample_name, value ) %>%
  group_by(Reef, name) %>%
  summarise(value = sum(value))  %>%
  select(from = name, to = Reef, value)

pmh8_network <- pmh8_reefseqs %>% 
  as_tbl_graph() %>% # convert to tidygraph format
   activate(nodes) %>%
   mutate(attribute = case_when(str_detect(name, "Bougainville|Chilcott|Flinders|Frederick|Herald|Holmes|Lihou|Marion|Moore|Osprey|Saumarez|Willis|Wreck") ~ "Reef", 
                                TRUE ~ "Sequence"), # add node attribute info
          text_label = case_when(attribute == "Reef" ~ name, TRUE ~ ""),
          centrality = centrality_degree()) %>% # can measure which nodes/reefs are 'central'
  ggraph(layout = "fr") +
   geom_edge_link(aes(width = value, alpha = value)) +
   scale_edge_width(range = c(0.2, 2.5)) + # control size
   geom_node_point(aes(shape = attribute, fill = centrality), size = 4) +
  geom_node_text(aes(label = text_label), repel = TRUE) +
   scale_shape_manual(values = c(21, 22)) +
  scale_fill_viridis_c(option = "magma")

pmh8_network

A hum Network

#list of sequences per reef. 
acro_seqreef <- acro_seqs %>% 
distinct(name, Reef)

#removing sequences that are abundant across 5 or more reefs, as these sequences will not be 'diagnostic' if present across many reefs 
remove_list <- acro_seqreef %>% 
  filter(!str_detect(name, "A|G")) %>%
  dplyr::count(name) %>% 
  filter(n >= 12) %>% 
  mutate(name = as.character(name)) %>% 
  pull(name)

#within each reef, get the sum of sequences across all samples collected. 
acro_reefseqs = acro_seqs %>%  
 filter(!(name %in% remove_list)) %>%
  select(name, Reef, sample_name, value ) %>%
  group_by(Reef, name) %>%
  summarise(value = sum(value))  %>%
  select(from = name, to = Reef, value)

acro_network <- acro_reefseqs %>% 
  as_tbl_graph() %>% # convert to tidygraph format
   activate(nodes) %>%
   mutate(attribute = case_when(str_detect(name, "Bougainville|Chilcott|Flinders|Frederick|Herald|Holmes|Lihou|Marion|Moore|Osprey|Saumarez|Willis|Wreck") ~ "Reef", 
                                TRUE ~ "Sequence"), # add node attribute info
          text_label = case_when(attribute == "Reef" ~ name, TRUE ~ ""),
          centrality = centrality_degree()) %>% # can measure which nodes/reefs are 'central'
  ggraph(layout = "fr") +
   geom_edge_link(aes(width = value, alpha = value)) +
   scale_edge_width(range = c(0.2, 2.5)) + # control size
   geom_node_point(aes(shape = attribute, fill = centrality), size = 4) +
    geom_node_text(aes(label = text_label), repel = TRUE) +
   scale_shape_manual(values = c(21, 22)) +
  scale_fill_viridis_c(option = "magma")

acro_network

Fig 5 Network analysis

pver_network + pmh8_network + acro_network